-------------------------------------------------------------------------------
      name:  <unnamed>
       log:  /afs/econ.duke.edu/data/vjh3/WageReturns/Data/y97/RawData/AFQT_MAT
> CHING_WK/AFQT_MATCHING_WK_with_weights.log
  log type:  text
 opened on:  14 Jan 2014, 13:38:40

. 
. /***************************************************
> MATCHING AFQT SCORES ACROSS NLSY 1979 and NLSY 1997
> 
> This do file creates a data file with comparable AFQT scores across both NLSY
> 79 and NLSY97.
> There are two main steps in creating the comparable AFQT scores:
> 
> 1. The 1979 ASVAB is a Paper and Pencil (P&P) test, while the 1997 ASVAB was 
> computer administered. 
> To make the scores comparable across cohorts, we rely on a percentile mapping
>  provided by Dan Segall (Segall (1997))
> 
> 2. The age at which respondents took the test differs between 1979 and 1997. 
> The 1997 sample is much younger.
> For both samples, we observe a large sample of individuals taking the test at
>  age 16. We use this overlap in the 
> test-taking age by mapping all test scores within cohorts into the age 16-dis
> tribution based on the within age 
> ranking of test scores. 
> 
> For details, see Segall (1997) and Altonji, Bharadwaj & Lange (2009).
> 
> Altonji, J., Bharadwaj, P. & Lange, F. "Changes in the Characteristics of Ame
> rican Youth - 
> Implications for Adult Outcomes" NBER Working Papers No. 13883, revised 2009.
> Segall, D. O. (1997). "Equating the CAT-ASVAB". In W. A. Sands, B. K. Waters,
>  & J. R. McBride (Eds.), 
>         Computerized adaptive testing: From inquiry to operation (pp. 181-198
> ). Washington, DC: American Psychological Association. 
> Date: August 19, 2009.
> ******************************************************/
. 
. // Set path to directory containing afqt1997a.csv //
. capture cd "/afs/econ.duke.edu/data/vjh3/WageReturns/Data/y97/RawData/AFQT_MA
> TCHING_WK"

. * cd "/Users/JKukkur/Documents/Research/ABL/AFQT MATCHING"
. 
. tempfile afqt97 afqt_append nlsy_agestd agestd_afqt missings finished_product

. 
. /****************************************************************
> First Step of score conversion: 
> Transform CAT Test Scores from NLSY97 into Paper and Pencil Test Scores using
>  mapping provided by Dan Segall.
> Combine this data with raw data from NLSY79.
> *****************************************************************/
. 
. 
. // afqt1997a.csv contains individual id's and sex from NLSY1997 and the ASVAB
>  component scores provided by Dan Segall. 
. // These are P&P equivalent scores based on the mapping procedure described 
> in Segall (1996). 
. // Dan Segall supplied us with these P&P equivalent scores using ASVAB compon
> ent scores contained in (DICTIONARY FILE).
. insheet using afqt1997a.csv, comma
(36 vars, 8984 obs)

. ren v1 pid

. ren v2 male

. gen asvabWK=wk if wk!=0 // wk=word knowledge; a score of 0 is set to missing
(1874 missing values generated)

. keep pid male asvabWK

. sort pid

. save `afqt97', replace
(note: file /tmp/90666.1.all.q/St10537.000004 not found)
file /tmp/90666.1.all.q/St10537.000004 saved

. 
. // Merge age in for the NLSY97 sample //
. infile using age97.dct, clear

infile dictionary {
  R0000100 "PUBID - YTH ID CODE 1997"
  R1194100 "CV_AGE_INT_DATE 1997"
  R2553500 "CV_AGE_INT_DATE 1998"
  R3876300 "CV_AGE_INT_DATE 1999"
  R5453700 "CV_AGE_INT_DATE 2000"
  R7216000 "CV_AGE_INT_DATE 2001"
  S1531400 "CV_AGE_INT_DATE 2002"
  S2001000 "CV_AGE_INT_DATE 2003"
  S3801100 "CV_AGE_INT_DATE 2004"
}

(8984 observations read)

. ren R0000100    pid

. ren R1194100    age  // age as of 1997 (test-taking year for NLSY97) //

. keep pid age

. sort pid

. merge pid using `afqt97' 
(note: you are using old merge syntax; see [D] merge for new syntax)

. drop _merge 

. sort pid

. save `afqt97', replace
file /tmp/90666.1.all.q/St10537.000004 saved

. 
. // Merge in weights for 1997 data //
. // We use the custom weight provided by the NLSY for the year 1997, the year 
> when the ASVAB was administered. //
. insheet using weights97.csv, clear
(2 vars, 8984 obs)

. sort pid

. merge pid using `afqt97'
(note: you are using old merge syntax; see [D] merge for new syntax)
pid was int now float

. drop _merge 

. gen sample=1                            // Sample Identifier: 1= 1997 NLSY sa
> mple, 0=1979 NLSY sample //

. sort sample pid

. save `afqt97', replace
file /tmp/90666.1.all.q/St10537.000004 saved

. 
. // NLSY 1979 Sample: Age Information and AFQT-scores //
. infile using y79_asvab.dct, clear

infile dictionary {
  R0000100 "ID# (1-12686) 79"
  R0000500 "DATE OF BIRTH - YR 79"
  R0173600 "SAMPLE ID  79 INT"
  R0406510 "AGE OF R @ INT DATE 80"
  R0618010 "PROFILES ASVAB SEC 1-STD SCRNR 81"
  R0618011 "PROFILES ASVAB SEC 2-STD SCRNR 81"
  R0618012 "PROFILES ASVAB SEC 3-STD SCRNR 81"
  R0618013 "PROFILES ASVAB SEC 4-STD SCRNR 81"
  R0618014 "PROFILES ASVAB SEC 5-STD SCRNR 81"
  R0618015 "PROFILES ASVAB SEC 6-STD SCRNR 81"
  R0618016 "PROFILES ASVAB SEC 7-STD SCRNR 81"
  R0618017 "PROFILES ASVAB SEC 8-STD SCRNR 81"
  R0618018 "PROFILES ASVAB SEC 9-STD SCRNR 81"
  R0618019 "PROFILES ASVAB SEC 10-STD SCRNR 81"
  R0618100 "PROFILES ASVAB VRBL-STD SCRNR 81"
  R0618200 "PROFILES AFQT PRCTILE SCORE-80 81"
  R0618300 "PROFILES AFQT PRCTILE 89 (REV) 81"
  R0618301 "PROFILES AFQT PRCTILE 2006 (REV) 81"
}

(12686 observations read)

. ren R0000100 pid                

. ren R0000500 birthyear 

. ren R0173600 sampid

. ren R0406510 age                                                             
>            // Age as of 1980 (test taking year for NLSY79) //

. 
. ren R0618010 gs

. ren R0618011 ar

. ren R0618012 wk

. ren R0618013 pc

. ren R0618014 no

. ren R0618015 cs

. ren R0618016 as

. ren R0618017 mk

. ren R0618018 mc

. ren R0618019 ei

. 
. ren R0618100 asvabVerb

. ren R0618200 AFQT_1

. ren R0618300 AFQT_2

. ren R0618301 AFQT_3

. 
. foreach var in gs ar wk pc no cs as mk mc ei {
  2.         qui replace `var'=. if `var'<0
  3. }

. gen asvabWK = wk if wk!=0
(808 missing values generated)

. drop if asvabWK==.
(808 observations deleted)

. label var asvabWK "Arithmetic Reasoning subset of ASVAB score"

. replace age=80-birthyear if age<0 & birthyear!=.        // Fill missing age u
> sing birth-year // 
(202 real changes made)

. drop birthyear

. gen sample=0                                                    // Sample Ide
> ntifier: 1= 1997 NLSY sample, 0=1979 NLSY sample //

. append using `afqt97'                           // Append the data-set for NL
> SY97 //

. sort sample pid

. save `afqt_append', replace
(note: file /tmp/90666.1.all.q/St10537.000005 not found)
file /tmp/90666.1.all.q/St10537.000005 saved

. 
. * Merge in 1979 weights
. // We use the custom weight provided by the NLSY for the year 1979, the year 
> when the ASVAB was administered. //
. 
. insheet using weights79.csv, clear
(2 vars, 12686 obs)

. gen sample=0

. sort sample pid

. merge sample pid using `afqt_append'
(note: you are using old merge syntax; see [D] merge for new syntax)
pid was int now float

. drop _merge

. 
. * The weights have implied 2 decimal places.
. replace weight=weight/100
weight was long now double
(21670 real changes made)

. sort sample pid

. save `afqt_append', replace
file /tmp/90666.1.all.q/St10537.000005 saved

. 
. /***************************************************************************
> Second Step: Percentile mapping of P&P test scores into age=16 distribution
> ****************************************************************************/
. 
. *****           GENERATING PERCENTILES OF SCORES BY AGE AND NLSY-SAMPLE *****
> ****
. use `afqt_append', clear

. drop gs ar wk pc no cs as mk mc ei asvabVerb AFQT_1 AFQT_2 AFQT_3

. 
. // Drop those with missing AFQT WK scores
. drop if asvabWK==.
(2682 observations deleted)

. 
. // For Table I in Introduction.doc
. bysort sample: tab age

-------------------------------------------------------------------------------
-> sample = 0

 AGE OF R @ |
INT DATE 80 |      Freq.     Percent        Cum.
------------+-----------------------------------
         15 |        962        8.10        8.10
         16 |      1,511       12.72       20.82
         17 |      1,488       12.53       33.35
         18 |      1,432       12.06       45.40
         19 |      1,502       12.65       58.05
         20 |      1,558       13.12       71.17
         21 |      1,539       12.96       84.12
         22 |      1,529       12.87       96.99
         23 |        357        3.01      100.00
------------+-----------------------------------
      Total |     11,878      100.00

-------------------------------------------------------------------------------
-> sample = 1

 AGE OF R @ |
INT DATE 80 |      Freq.     Percent        Cum.
------------+-----------------------------------
         12 |        967       13.60       13.60
         13 |      1,411       19.85       33.45
         14 |      1,480       20.82       54.26
         15 |      1,493       21.00       75.26
         16 |      1,326       18.65       93.91
         17 |        430        6.05       99.96
         18 |          3        0.04      100.00
------------+-----------------------------------
      Total |      7,110      100.00


. 
. 
. // For Figure 1 in NLSY79
. kdensity asvabWK if sample==0&age==16, addplot(kdensity asvabWK if sample==1&
> age==16, lpattern(_)) title(Figure 1: AFQT Scores at Age 16) ///
> note("The NLSY79-scores are the P&P scores reported by the NLSY79." "The NLSY
> -97 scores are based on the CAT scores from NLSY97 and the equation by Segal 
> (1997)." "Both populations are weighted to be population representative.") //
> /
> legend(label(1 "NLSY 1979") label(2 "NLSY 1997") cols(2)) saving(HistScores, 
> replace)
(file HistScores.gph saved)

. 
. 
. // Combine sparsely populated age-groups in both samples with adjacent age-gr
> oups //
. replace age=22 if age==23 & sample==0
(357 real changes made)

. replace age=17 if age==18 & sample==1
(3 real changes made)

. 
. *******************************************************************
. *EXPANDING THE DATA AND CREATING PERCENTILES BY HAND
. // The following procedure improves the quality of the percentile mapping. 
. // The problem is that some observations 'belong' in several percentiles beca
> use they 
. // have large weights. Stata commands such as xtile will simply assign a uniq
> ue percentile to these
. // observations. Instead, we need to account for the fact that these observat
> ions belong to several pctiles. 
. // This is achieved by expanding the data-set proportionally to the weights a
> nd then generating percentiles. 
. 
. *expanding each observation by its weight - an observation with a weight of 1
> 100 is expanded into 11 observations. 
. gen percentile_rank=.
(18988 missing values generated)

. replace weight=round(weight/100)
(18988 real changes made)

. foreach num of numlist 0/160 {
  2.         qui expand `num' if weight==`num'
  3. }

. 
. *generating a unique rank within each sample and age
. bysort sample age: egen r=rank(asvabWK), u

. gen pasvabWK=.
(471151 missing values generated)

. 
. * Divide the rank by number of individuals corresponding to the population of
>  a given age and sample
. * to get the percentile of an individual.
. 
. gen holdey = 1/100

. bysort sample age: egen sumWgts=total(holdey)

. 
. *SAMPLE==0
. qui replace pasvabWK=round(r/sumWgts) if age==15 & sample==0

. qui replace pasvabWK=round(r/sumWgts) if age==16 & sample==0

. qui replace pasvabWK=round(r/sumWgts) if age==17 & sample==0

. qui replace pasvabWK=round(r/sumWgts) if age==18 & sample==0

. qui replace pasvabWK=round(r/sumWgts) if age==19 & sample==0

. qui replace pasvabWK=round(r/sumWgts) if age==20 & sample==0

. qui replace pasvabWK=round(r/sumWgts) if age==21 & sample==0

. qui replace pasvabWK=round(r/sumWgts) if age==22 & sample==0

. 
. *SAMPLE 1
. qui replace pasvabWK=round(r/sumWgts) if age==12 & sample==1

. qui replace pasvabWK=round(r/sumWgts) if age==13 & sample==1

. qui replace pasvabWK=round(r/sumWgts) if age==14 & sample==1

. qui replace pasvabWK=round(r/sumWgts) if age==15 & sample==1

. qui replace pasvabWK=round(r/sumWgts) if age==16 & sample==1

. qui replace pasvabWK=round(r/sumWgts) if age==17 & sample==1

. 
. *dropping the duplicates now
. egen tag=tag(sample pid)

. keep if tag==1
(452163 observations deleted)

. drop tag

. 
. sort sample pasvabWK

. save `nlsy_agestd', replace
(note: file /tmp/90666.1.all.q/St10537.000006 not found)
file /tmp/90666.1.all.q/St10537.000006 saved

. 
. *****************************************************************************
. 
. // Within sample, we map AFQT scores by age into the age=16
. // distribution. We therefore require mean AFQT-scores of age=16 by percentil
> e in each sample. 
. // We need to generate these averages using the weights.
. bys sample pasvabWK: egen pop=sum(weight) if age==16
(16151 missing values generated)

. bys sample pasvabWK: egen tot_score=sum(asvabWK*weight) if age==16      // Me
> an age=16 raw score for each percentile //
(16151 missing values generated)

. bys sample pasvabWK: gen mean=tot_score/pop if age==16
(16151 missing values generated)

. drop tot_score pop

. egen tag=tag(sample pasvabWK mean) if age==16                   // We need on
> ly 1 obs per sample and percentile //

. keep if tag==1
(18786 observations deleted)

. ren mean asvabWK_std

. keep sample asvabWK_std pasvabWK asvabWK

. sort sample pasvabWK

. save `agestd_afqt', replace
(note: file /tmp/90666.1.all.q/St10537.000007 not found)
file /tmp/90666.1.all.q/St10537.000007 saved

. 
. use `nlsy_agestd', clear

. merge sample pasvabWK using `agestd_afqt' // Merge into each percentile the a
> ge=16 corresponding score //
(note: you are using old merge syntax; see [D] merge for new syntax)
variables sample pasvabWK do not uniquely identify observations in the master
    data

. drop _merge

. keep sample pid male pasvabWK asvabWK_std asvabWK weight age

. sort sample pid

. // The final data contains "asvabWK_std": which is the comparable score acros
> s the 2 NLSY samples //
. *keep if sample==1 // this drops the NLSY79 observations //
. ren pid ID

. ren weight weight_altonji

. gen year = 1979 if sample==0
(7110 missing values generated)

. replace year = 1997 if sample==1 
(7110 real changes made)

. *drop sample age asvabWK pasvabWK // drop the other variables since I don't u
> se them in my research //
. preserve

.         use `afqt97', clear

.         keep pid male

.         ren pid ID

.         save `missings', replace
(note: file /tmp/90666.1.all.q/St10537.000008 not found)
file /tmp/90666.1.all.q/St10537.000008 saved

. restore

. 
. preserve

.         keep if sample==1
(11878 observations deleted)

.         merge 1:1 ID using `missings'

    Result                           # of obs.
    -----------------------------------------
    not matched                         1,874
        from master                         0  (_merge==1)
        from using                      1,874  (_merge==2)

    matched                             7,110  (_merge==3)
    -----------------------------------------

.         tab _merge

                 _merge |      Freq.     Percent        Cum.
------------------------+-----------------------------------
         using only (2) |      1,874       20.86       20.86
            matched (3) |      7,110       79.14      100.00
------------------------+-----------------------------------
                  Total |      8,984      100.00

.         zscore asvabWK_std
z_asvabWK_std created with 1874 missing values

.         drop asvabWK_std

.         ren z_asvabWK_std asvabWK_std

.         keep if male==1
(4385 observations deleted)

.         sort ID

.         outsheet ID asvabWK_std using altonjiAFQT.csv, comma nol replace

. restore

. save afqt_adjusted_final, replace 
file afqt_adjusted_final.dta saved

. save `finished_product', replace
(note: file /tmp/90666.1.all.q/St10537.000009 not found)
file /tmp/90666.1.all.q/St10537.000009 saved

. 
. tabstat asvabWK_std, by(sample) stats(n min max mean median)

Summary for variables: asvabWK_std
     by categories of: sample 

  sample |         N       min       max      mean       p50
---------+--------------------------------------------------
       0 |     11878        20        61  43.90701        44
       1 |      7110        20        61  46.16638        48
---------+--------------------------------------------------
   Total |     18988        20        61  44.75302  46.22794
------------------------------------------------------------

. 
. isid ID year

. 
. * Test: the afqt distributions across age within sample should now be identic
> al. There will still be very small deviations
. * because of the coarseness of the above expansion, but we believe these to b
> e third order. To allow this code to run 
. * rapidly, we tolerate these deviations. 
. 
. **bys sample age: sum asvabWK_std [fw=weight], d
. 
. use `finished_product', clear

. ** Generate a NLSY79-only supplement:
. drop if sample==1
(7110 observations deleted)

. save afqt_adjusted_final79, replace
file afqt_adjusted_final79.dta saved

. 
. use `finished_product', clear

. ** Generate a NLSY97-only supplement:
. drop if sample==0
(11878 observations deleted)

. save afqt_adjusted_final97, replace
file afqt_adjusted_final97.dta saved

. 
. log close
      name:  <unnamed>
       log:  /afs/econ.duke.edu/data/vjh3/WageReturns/Data/y97/RawData/AFQT_MAT
> CHING_WK/AFQT_MATCHING_WK_with_weights.log
  log type:  text
 closed on:  14 Jan 2014, 13:38:50
-------------------------------------------------------------------------------
